In [1]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *

In [2]:
# Gather every .ipynb from each student's subdirectory under ../testbed/Final
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]

In [3]:
# Loading in the homework corpus (the Final corpus was loaded in the previous cell)
notebooks = [os.path.join('../hw_corpus', fname) for fname in os.listdir('../hw_corpus')]
hw_notebook_objs = [NotebookMiner(file) for file in notebooks]

In [4]:
# Group the homework notebooks by author (the filename prefix before the first underscore)
person_to_notebooks = {}
for nb in hw_notebook_objs:
    person = nb.filename.split('/')[2].split('_')[0]
    if person not in person_to_notebooks:
        person_to_notebooks[person] = []
    person_to_notebooks[person].append(nb)

In [5]:
print(len(person_to_notebooks))


56

In [6]:
print(len(os.listdir('../testbed/Final')))


176

There are 176 students in the Final directory but only 56 in the homework corpus. The hw_corpus was actually built from 60 repos, so 4 of them apparently contain no notebooks. Representative example: cyriaquebrousse.
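
As a quick sanity check on these numbers, the two author sets can be compared directly. This is a minimal sketch and assumes the subdirectory names in ../testbed/Final use the same author identifiers as the homework filename prefixes collected above:

In [ ]:
final_students = {p for p in os.listdir('../testbed/Final')
                  if os.path.isdir(os.path.join('../testbed/Final', p))}
hw_students = set(person_to_notebooks.keys())
# Final-project students with no homework notebook in hw_corpus
print(len(final_students - hw_students))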


In [7]:
# For each author, keep only their largest homework notebook (by number of cells)
max_hw_notebook_objs = []
for key in person_to_notebooks.keys():
    cur_max = 0
    max_nb = None
    for nb in person_to_notebooks[key]:
        if nb.get_number_cells() > cur_max:
            cur_max = nb.get_number_cells()
            max_nb = nb
    max_hw_notebook_objs.append(max_nb)

In [8]:
from nbminer.stats.multiple_summary import MultipleSummary
hw_summary = MultipleSummary(max_hw_notebook_objs)
final_summary = MultipleSummary(notebook_objs)
print("Number of Final notebooks: ", len(final_summary.summary_vec))
print("Number of Homework notebooks: ", len(hw_summary.summary_vec))
print("Average number of cells, Final: ", final_summary.average_number_of_cells())
print("Average number of cells, Homework: ", hw_summary.average_number_of_cells())
print("Average lines of code, Final: ", final_summary.average_lines_of_code())
print("Average lines of code, Homework: ", hw_summary.average_lines_of_code())


Number of Final notebooks:  177
Number of Homework notebooks:  56
Average number of cells, Final:  68.92090395480226
Average number of cells, Homework:  79.30357142857143
Average lines of code, Final:  271.3502824858757
Average lines of code, Homework:  392.69642857142856

In [9]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
a = Features(max_hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters = 100)
#agr = ASTGraphReducer(a, threshold=20, split_call=False)
njs = NotebookJaccardSimilarity()
pipe = Pipeline([gastf, rbn, gi, fe, ke, njs])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x151dc5e630>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x10733dd68>
<nbminer.preprocess.get_imports.GetImports object at 0x151c32d518>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x151c32db00>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x151c32d390>
<nbminer.results.similarity.jaccard_similarity.NotebookJaccardSimilarity object at 0x151dad7f60>
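
The printed lines above are just each pipeline stage announcing itself as it runs. Conceptually the Pipeline chains the stages' transforms in order; a rough sketch of that behavior (for illustration only, not the actual nbminer source):

In [ ]:
class SimplePipeline:
    """Sketch of a sequential transform pipeline, for illustration only."""
    def __init__(self, steps):
        self.steps = steps

    def transform(self, features):
        # Apply each stage to the output of the previous one
        for step in self.steps:
            features = step.transform(features)
        return features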

In [10]:
import numpy as np
intra, inter = njs.group_average_jaccard_similarity('group_1')
print('Mean within group: ', np.mean(np.array(intra)))
print('STD within group: ', np.std(np.array(intra)))
print('Mean outside group: ', np.mean(np.array(inter)))
print('STD outside group: ', np.std(np.array(inter)))


Mean within group:  0.263525329937
STD within group:  0.0392584417538
Mean outside group:  0.243111132076
STD outside group:  0.0359253606179
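
For reference, the Jaccard similarity between two notebooks is presumably computed here over their sets of template labels (the KMeans cluster ids assigned above): |A ∩ B| / |A ∪ B|. A minimal sketch of that pairwise score, independent of the exact NotebookJaccardSimilarity implementation:

In [ ]:
def jaccard(a, b):
    """Jaccard similarity of two collections of template ids."""
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

# e.g. jaccard(['template_0', 'template_7'], ['template_0', 'template_9']) -> 1/3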

In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
a = Features(max_hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
fe = FeatureEncoding()
ke = KmeansEncoder(n_clusters = 100)
#agr = ASTGraphReducer(a, threshold=20, split_call=False)
ci = CorpusIdentifier()
pipe = Pipeline([gastf, rbn, gi, fe, ke, ci])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x10a83e9b0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a305eda58>
<nbminer.preprocess.get_imports.GetImports object at 0x1a305edcf8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a30d8eb00>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a30d8e7f0>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x1a30e812e8>

In [66]:
%matplotlib inline
import matplotlib.pyplot as plt
fpr, tpr, m = ci.predict()
print(m)
plt.plot(fpr, tpr)


0.321249430005
Out[66]:
[<matplotlib.lines.Line2D at 0x1a380bb198>]
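
It is not obvious from the CorpusIdentifier output alone what the printed 0.321 measures. If the area under the plotted ROC curve is wanted, it can be computed directly from the returned arrays (assuming fpr and tpr are the usual monotonically ordered ROC coordinates):

In [ ]:
from sklearn.metrics import auc
# Area under the ROC curve plotted above
print(auc(fpr, tpr))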

In [49]:
# Count template occurrences per corpus: template_counter maps group -> template id -> count
template_counter = {'group_1': {}, 'group_2': {}}
for i in range(a.get_number_notebooks()):
    group = a.get_notebook(i).get_feature('import_name')
    for seg in a.get_notebook_segments(i):
        templ = seg.get_feature('template')
        if templ is not None:
            if templ not in template_counter[group]:
                template_counter[group][templ] = 0
            template_counter[group][templ] += 1

In [58]:
# Raw counts per template: homework (group_1) vs final (group_2)
for key in template_counter['group_1'].keys():
    print(template_counter['group_1'][key], template_counter['group_2'][key])


8317 14213
109 109
926 1660
938 1587
91 170
3077 4940
159 235
1874 3200
222 353
300 487
117 205
150 223
36 67
256 420
118 208
151 265
225 394
85 93
1139 2022
106 188
58 84
239 454
221 408
233 427
164 294
200 355
655 1248
370 676
146 214
63 99
82 139
75 149
89 177
94 168
63 107
203 360
551 1044
149 278
76 147
111 175
237 438
34 60
174 302
76 136
197 340
229 405
53 67
132 209
43 77
128 198
67 117
28 45
322 631
272 483
52 94
1 1
31 59
92 136
332 638
45 81
99 184
105 186
115 200
534 1059
101 185
115 205
37 64
83 153
11 16
67 122
155 291
63 122
105 198
151 281
89 156
82 126
9 14
14 26
10 10
10 17
1 1
46 91
66 131
2 2
4 4
2 2
16 30
23 43
22 41
10 19
5 9
194 388
67 134
1 2
1 2
3 6
1 2
1 2
1 2
1 2

In [67]:
# For each template, the fraction of its occurrences that come from group_1 (homework)
percentages = []
total_sum_1 = 0
total_sum_2 = 0
for key in template_counter['group_1'].keys():
    v1 = template_counter['group_1'][key]
    v2 = template_counter['group_2'][key]
    total_sum_1 += v1
    total_sum_2 += v2
    arr = (v1/(v1+v2), key, v1+v2)
    percentages.append(arr)

In [68]:
print(total_sum_1/(total_sum_1+total_sum_2))


0.3661901685242018

In [47]:
# Smaller fraction -> the template is relatively more common in group 2 (aka, the final projects);
# the overall baseline computed above is ~0.366. Only templates with more than 20 total
# occurrences are shown.
import astor
for el in sorted(percentages):
    if el[2] > 20:
        print(el)
        print(astor.to_source(ke.templates.get_random_example(el[1])))


(0.3333333333333333, 'template_70', 582)
var().magic('matplotlib inline')

(0.3333333333333333, 'template_89', 201)
sns.barplot(x='month', y='favorite_count', data=var)

(0.33458646616541354, 'template_45', 266)
var = [var.doc2bow(var) for var in var]

(0.33482142857142855, 'template_64', 224)
var = gensim.corpora.Dictionary(var)

(0.3350253807106599, 'template_68', 197)
var = plt.subplot(224)

(0.3352165725047081, 'template_28', 1593)
var = var.groupby(['hour'])[var].sum().reset_index()

(0.3357664233576642, 'template_77', 137)
var = RandomForestRegressor()

(0.3378803777544596, 'template_22', 953)
var = pd.read_json('eth_en.json')

(0.34054054054054056, 'template_73', 185)
var = LinearRegression()

(0.34080717488789236, 'template_52', 223)
var, var, var, var = train_test_split(var, var, test_size=0.4, random_state=415
    )

(0.3422680412371134, 'template_8', 970)
sum([(1) for var, var in var.items() if not var])

(0.3441933788754598, 'template_35', 1903)
plt.plot(var, var, label='EPFL')

(0.34444444444444444, 'template_29', 90)
var = var.agg(var)

(0.3448275862068966, 'template_75', 29)
def get_rt_and_fav_sum_of_hashtags(data):
    var = var.copy()
    var['hashtags'] = var['entities'].apply(var)
    var = var[['hashtags', 'retweet_count', 'favorite_count']]
    var = pd.DataFrame(var(var), columns=['hashtag', 'rt', 'fav'])
    var = []
    for var, var in var.groupby('hashtag'):
        var.append({'hashtag': var, 'rt': var.rt.sum(), 'fav': var.fav.sum()})
    return pd.DataFrame(var)

(0.3448773448773449, 'template_9', 693)
var.describe()

(0.34545454545454546, 'template_7', 1595)
var.drop('contributors', axis=1, inplace=True)

(0.3465346534653465, 'template_26', 303)
var[1, 1].set_title('ETHZ retweets per month')

(0.3475336322869955, 'template_24', 446)
var.legend(labels=['EPFL', 'ETH'])

(0.34782608695652173, 'template_14', 46)
def get_hashtags(df):
    var = []
    for var, var in var.iterrows():
        for var in var['entities'].get('hashtags'):
            var = var.get('text')
            if var not in var:
                var.append(var)
    return var

(0.3484848484848485, 'template_94', 66)
def add_year_month_hour(df):
    var['year'] = var.apply(var, axis=1)
    var['month'] = var.apply(var, axis=1)
    var['hour'] = var.apply(var, axis=1)

(0.3486590038314176, 'template_98', 261)
var = pd.to_datetime(var.created_at)

(0.34894613583138173, 'template_15', 427)
var = var.dropna()

(0.3492063492063492, 'template_4', 63)
var['created_at'].resample('A').count().plot(marker='o', color='r')

(0.34951456310679613, 'template_34', 103)
for var in var:
    var.append(str(var).split()[0].split('-')[1])

(0.34953703703703703, 'template_61', 432)
plt.hist(var, bins=var)

(0.3498233215547703, 'template_55', 283)
var.set_ylabel('Number', fontsize=12)

(0.35, 'template_67', 40)
def trend_by_year(df):
    var['year'] = var['created_at'].map(lambda x: var.year)
    var = var.groupby('year')
    var['id'].count().plot(kind='bar')
    plt.title('Number of tweets per year')
    plt.show()
    var['retweet_count'].sum().plot(kind='bar')
    plt.title('Number of retweets per year')
    plt.show()
    var['favorite_count'].sum().plot(kind='bar')
    plt.title('Number of favorites per year')
    plt.show()

(0.3511111111111111, 'template_33', 675)
var.fit(var, var)

(0.35135135135135137, 'template_12', 629)
var = var.bar(var + 0.2, var, width=0.2, color='r', align='center')

(0.3516949152542373, 'template_37', 236)
var.set_xlabel('Year', fontsize=12)

(0.353030303030303, 'template_53', 660)
plt.title("""EPFL: Histogram of Retweet Counts 
 Average per tweet: %.4s""" %
    (var.retweet_count.sum() / len(var)))

(0.3531468531468531, 'template_79', 286)
plt.subplots(figsize=(8, 6))

(0.35372848948374763, 'template_23', 1046)
plt.show()

(0.3544973544973545, 'template_63', 189)
var = sklearn.model_selection.cross_val_score(var, var, var, cv=10, scoring
    ='neg_mean_squared_error')

(0.3561643835616438, 'template_91', 146)
var = sorted(var.items(), key=operator.itemgetter(1), reverse=True)

(0.35714285714285715, 'template_80', 126)
plt.tight_layout()

(0.35807860262008734, 'template_74', 458)
plt.xlabel('Value of retweet count')

(0.3580819798917247, 'template_11', 2586)
var = var.Date.apply(lambda s: var - var.Date.tail(1)).values / (1000 * 
    1000 * 1000 * 60 * 60)

(0.35833333333333334, 'template_65', 120)
plt.suptitle('ETH months')

(0.3584905660377358, 'template_97', 212)
plt.xticks(())

(0.35877862595419846, 'template_87', 262)
var['hashtags'] = var.text.apply(lambda x: ' '.join([var.lower() for var in
    var.split() if '#' in var]))

(0.359375, 'template_72', 320)
np.all(var.isnull(), axis=1)

(0.36026490066225164, 'template_6', 755)
var = var.groupby(var['created_at'].dt.year)

(0.3603290098070231, 'template_1', 3161)
print(var)

(0.36036036036036034, 'template_27', 555)
plt.ylabel('# of Tweets')

(0.36054421768707484, 'template_66', 294)
var = set(stopwords.words('english'))

(0.3605683836589698, 'template_76', 563)
var.Race.value_counts()

(0.36082474226804123, 'template_51', 291)
print('Mean squared error: %.2f' % np.mean((var.predict(var) - var) ** 2))

(0.361198738170347, 'template_44', 634)
var = var.copy()

(0.3617021276595745, 'template_86', 94)
var['year'] = pd.Series([0] * var.shape[0], index=var.index)

(0.3619631901840491, 'template_56', 326)
var = pd.concat([var, var], axis=1)

(0.3629807692307692, 'template_38', 416)
var['created_at'] = var['created_at'].map(lambda x: var.hour)

(0.363265306122449, 'template_60', 245)
var.origin.unique()

(0.36335403726708076, 'template_41', 322)
var['year'] = [var.year for var in var.created_at.tolist()]

(0.36348949919224555, 'template_59', 619)
for var in var:
    if var % 500 == 0:
        print(var)
    var += 1
    var.append(var(var))

(0.3641304347826087, 'template_81', 184)
var.sample(5)

(0.36507936507936506, 'template_99', 315)
var = var.groupby(by=var.columns, axis=1, level=0).agg(sum)

(0.36554621848739494, 'template_31', 476)
plt.figure()

(0.36633663366336633, 'template_62', 101)
for var, var in var.items():
    print('***')
    print(var)
    print('Number of favorites: ' + str(var['favorite_count'].sum()))
    print('Number of retweets: ' + str(var['retweet_count'].sum()))
    print('\n')

(0.36685288640595903, 'template_21', 537)
def feature_importance(data, labels):
    var = ExtraTreesRegressor(n_estimators=250, random_state=0)
    var.fit(var, var)
    var = var.feature_importances_
    var = np.std([var.feature_importances_ for var in var.estimators_], axis=0)
    var = np.argsort(var)[::-1]
    print('Feature ranking:')
    for var in range(var.shape[1]):
        print('%d. feature %d (%f)' % (var + 1, var[var], var[var[var]]))
    plt.figure()
    plt.title('Feature importances')
    plt.bar(range(var.shape[1]), var[var], color='r', yerr=var[var], align=
        'center')
    plt.xticks(range(var.shape[1]), var)
    plt.xlim([-1, var.shape[1]])
    plt.show()

(0.36915224145583664, 'template_0', 22530)
var['hour'] = var.created_at.dt.hour

(0.36933385888845094, 'template_2', 5074)
var(var)

(0.37037037037037035, 'template_54', 27)
def normalize_text(text):
    var = re.sub(
        '((www\\.[^\\s]+)|(https?://[^\\s]+)|(pic\\.twitter\\.com/[^\\s]+))',
        '', var)
    var = re.sub('@[^\\s]+', '', var)
    var = re.sub('#([^\\s]+)', '', var)
    var = re.sub('[:;>?<=*+()/,\\-#!$%\\{˜|\\}\\[^_\\@\\]1234567890’‘]',
        ' ', var)
    var = re.sub('[\\d]', '', var)
    var = var.replace('.', '')
    var = var.replace("'", ' ')
    var = var.replace('"', ' ')
    return var

(0.37058823529411766, 'template_92', 170)
plt.imshow(var)

(0.37104072398190047, 'template_69', 221)
var = WordCloud().generate(var)

(0.3714851485148515, 'template_10', 2525)
var.head()

(0.378698224852071, 'template_20', 676)
pd.DataFrame(var, columns=['Favorites per tweet', 'Retweets per tweet']).plot(
    kind='bar')

(0.3811944091486658, 'template_58', 787)
for var in var.index:
    var = var.loc[var]['entities']['hashtags']
    if len(var) == 0:
        continue
    for var in var:
        if var['text'] not in var:
            var[var['text']] = {'retweet_count': 0, 'favorite_count': 0}
        var[var['text']]['retweet_count'] += var.loc[var]['retweet_count']
        var[var['text']]['favorite_count'] += var.loc[var]['favorite_count']

(0.3835616438356164, 'template_83', 73)
var = folium.Map(location=[np.mean(var.placeLatitude), np.mean(var.
    placeLongitude)], tiles='Cartodb Positron', zoom_start=6)

(0.3838094050143445, 'template_93', 8017)
var.axis('off')

(0.38608695652173913, 'template_90', 575)
print(len(var))

(0.3870967741935484, 'template_57', 341)
var.set_index(['createdAt'], inplace=True)

(0.3881118881118881, 'template_47', 286)
var.set_xticklabels(var.index.astype(str))

(0.3888888888888889, 'template_16', 162)
for var in var.index.values:
    var = var.index.str.contains(var, case=False, na=False)
    if len(var[var].index) == 1:
        var = np.append(var, var.index.get_loc(var[var].index[0]))
        var = np.append(var, var)

(0.391304347826087, 'template_36', 23)
var = [var('bahnhofstrasse', 47.377, 8.54, 2005, -1)[0]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2006, -1)[5]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2007, -1)[2]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2008, -1)[9]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2009, -1)[4]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2010, -1)[2]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2011, -1)[0]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2012, -1)[0]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2013, -1)[1]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2014, -1)[9]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2015, -1)[5]['url_m'], var(
    'bahnhofstrasse', 47.377, 8.54, 2016, -1)[54]['url_m']]

(0.39263803680981596, 'template_78', 326)
var.created_at.groupby(var.created_at.dt.year).count().plot(kind='bar')

(0.3942307692307692, 'template_49', 208)
var.isnull().sum().append(var.isnull().sum())

(0.40214477211796246, 'template_25', 373)
var.reset_index(inplace=True)

(0.40350877192982454, 'template_82', 228)
var = var.sort_values(axis=0, ascending=False)[var > 15].index[1:]

(0.4035532994923858, 'template_40', 394)
var['n_at'] = [len(list(set({var.strip('@') for var in var.split() if var.
    startswith('#')}))) for var in var.text]

(0.40555555555555556, 'template_18', 360)
var = open('text_eth.csv', 'r', encoding='utf-8').readlines()

(0.4074074074074074, 'template_96', 27)
def format_individual_global_voting_profile(voting_unique):
    var = var.set_index(['ParlGroupName', 'Name'])[['Decision']]
    var = lambda x: np.sum(var == 1) / len(var)
    var = lambda x: np.sum(var == 2) / len(var)
    var = lambda x: np.sum(var == 3) / len(var)
    var = var.groupby(level=['ParlGroupName', 'Name']).agg({'Decision': {
        'Yes': var, 'No': var, 'Abstention': var}})
    var.columns = var.columns.droplevel(0)
    return var

(0.4084507042253521, 'template_85', 142)
var = list(map(lambda city: [var.Canton.values[0], str(var.Longitude.values
    [0]) + ',' + str(var.Latitude.values[0])], var))

(0.44166666666666665, 'template_30', 120)
for var, var in var.groupby('radio'):
    var = var.reset_index().drop(['date', 'tags', 'radio'], axis=1)
    var['artists'] = var['artists'].apply(lambda x: ', '.join(var))
    var = 0
    var = []
    for var, var in var.groupby('artists'):
        var = var + 1
        var.append(var['track'].unique().size)
    var = numpy.mean(var)
    print(var + ': ' + str(var) + ' artists, with ' + str(round(var, 1)) +
        ' mean unique songs per artist')

(0.47752808988764045, 'template_71', 178)
var.to_csv('emotion_season_data_LexiconBasedApproach/spring5.csv', index=None)

(0.5, 'template_48', 218)
var = pd.read_csv(var, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE,
    header=None, escapechar='\\', na_values='N', names=var)


In [28]:
import astor
print(astor.to_source(ke.templates.get_random_example('template_89')))
print(astor.to_source(ke.templates.get_random_example('template_64')))
print(astor.to_source(ke.templates.get_random_example('template_73')))
print(astor.to_source(ke.templates.get_random_example('template_5')))


var = sns.barplot(var.index, var.favorite_count)

var = corpora.Dictionary(var)

var = LinearRegression(n_jobs=-1)

